# Notebook helper: injects a JS button that toggles visibility of all input
# (code) cells; cells start hidden because the document-ready hook calls
# code_toggle() once on load.
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Show Code"></form>''')
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# FIX: sns.set() resets ALL style/context rcParams, so it must run BEFORE
# sns.set_context() -- in the original order the "paper" context and
# font_scale=1.4 were silently discarded.
sns.set(style="whitegrid")
# seaborn paper context
sns.set_context("paper", font_scale=1.4)
import plotly.plotly as py  # NOTE(review): deprecated module in newer plotly (moved to chart_studio)
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import Contours, Histogram2dContour, Marker, Scatter, Data, Layout, Figure
import plotly.tools as tools
# Enable offline (in-notebook) plotly rendering.
init_notebook_mode()
# Function def: needed for using Plotly in Google Colab
# Call it in each offline plotting cell
def configure_plotly_browser_state():
    """Point require.js at the plotly.js CDN inside the current Colab output
    frame. Colab creates a fresh iframe per output cell, so this must be
    called again in every cell that renders an offline plotly chart."""
    import IPython
    display(IPython.core.display.HTML('''
<script src="/static/components/requirejs/require.js"></script>
<script>
requirejs.config({
paths: {
base: '/static/base',
plotly: 'https://cdn.plot.ly/plotly-1.5.1.min.js?noext',
},
});
</script>
'''))
def GridSearch_table_plot(grid_clf, param_name,
                          num_results=15,
                          negative=True,
                          graph=True,
                          display_all_params=True):
    """Report and plot the results of a fitted GridSearchCV.

    Parameters
    ----------
    grid_clf : fitted GridSearchCV instance
    param_name : str, hyper-parameter name to plot on the x-axis
        (looked up as 'param_<param_name>' in cv_results_)
    num_results : int, number of top-ranked rows to display
    negative : bool, True when the scorer is a negated loss (neg_*),
        so scores are sign-flipped before reporting
    graph : bool, draw the score-vs-parameter errorbar plot when True
    display_all_params : bool, pretty-print all params of the best estimator
    """
    from matplotlib import pyplot as plt
    from IPython.display import display
    import pandas as pd

    clf = grid_clf.best_estimator_
    clf_params = grid_clf.best_params_
    clf_score = -grid_clf.best_score_ if negative else grid_clf.best_score_
    clf_stdev = grid_clf.cv_results_['std_test_score'][grid_clf.best_index_]
    cv_results = grid_clf.cv_results_

    print("best parameters: {}".format(clf_params))
    print("best score: {:0.5f} (+/-{:0.5f})".format(clf_score, clf_stdev))
    if display_all_params:
        import pprint
        pprint.pprint(clf.get_params())

    # pick out the best results
    # =========================
    scores_df = pd.DataFrame(cv_results).sort_values(by='rank_test_score')
    best_row = scores_df.iloc[0, :]
    best_mean = -best_row['mean_test_score'] if negative else best_row['mean_test_score']
    best_stdev = best_row['std_test_score']
    best_param = best_row['param_' + param_name]

    # display the top 'num_results' results
    # =====================================
    # FIX: reuse the already-built, already-sorted frame instead of
    # constructing and sorting a second DataFrame from cv_results.
    display(scores_df.head(num_results))

    # plot the results
    # ================
    scores_df = scores_df.sort_values(by='param_' + param_name)
    means = -scores_df['mean_test_score'] if negative else scores_df['mean_test_score']
    stds = scores_df['std_test_score']
    params = scores_df['param_' + param_name]

    if graph:
        plt.figure(figsize=(8, 8))
        plt.errorbar(params, means, yerr=stds)
        # red band: +/- one stdev around the best mean score
        plt.axhline(y=best_mean + best_stdev, color='red')
        plt.axhline(y=best_mean - best_stdev, color='red')
        plt.plot(best_param, best_mean, 'or')
        plt.title(param_name + " vs Score\nBest Score {:0.5f}".format(clf_score))
        plt.xlabel(param_name)
        plt.ylabel('Score')
        plt.show()
# NOTE (translated): 7 rows containing null values were removed from the
# dataset by hand before loading it here.
features = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','target']
df = pd.read_csv("processed_cleveland.data", names=features)
print(df.info())
print(df.describe()) # summary statistics of the numeric features
The 'target' field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4.
Since we just want to classify patients as Heart Disease Affected or Not Affected, we just consider:
Values 1, 2, 3, 4 as 1 ==> Heart Disease Affected Patient
Value 0 ==> Not Affected Patient.
# Binarize the target: 0 stays 0 (healthy), severities 1-4 collapse to 1.
target_dict = {0: 0, 1: 1, 2: 1, 3: 1, 4: 1}
binarized = df.target.map(target_dict)
# Drop and re-append so 'target' stays the last column of the frame.
df = df.drop('target', axis=1)
df['target'] = binarized
Once imported the dataset, let's check data distribution. The objective is to gain some insight on features importance in predicting a patient affected or will be by heart diseases.
Which are the most important features to predict a future heart disease affected patient?
The box plot is a method for graphically displaying the distribution of data based on the five number summary: minimum, first quartile, median, third quartile, and maximum.
See picture, the central rectangle spans the first quartile to the third quartile (the interquartile range or IQR).
A segment inside the rectangle shows the median while "whiskers" above and below the box show the locations of the minimum and maximum.
Not uncommonly real datasets will display surprisingly high maximums or surprisingly low minimums called outliers.
John Tukey definition for outliers:
Outliers are either 3×IQR or more above the third quartile or 3×IQR or more below the first quartile.
Suspected outliers are slightly more central versions of outliers: either 1.5×IQR or more above the third quartile or 1.5×IQR or more below the first quartile.
If either type of outlier is present the whisker on the appropriate side is taken to 1.5×IQR from the quartile (the "inner fence") rather than the max or min, and individual outlying data points are displayed as unfilled circles (for suspected outliers) or filled circles (for outliers). (The "outer fence" is 3×IQR from the quartile.)

This feature distinguishes between a healthy subject and one affected by heart disease. In the original dataset (from UCI) this was a categorical attribute assuming values from 0 to 4, where 0 meant no heart disease at all and numbers from 1 to 4 meant presence of heart disease.
# Class balance: bar chart of the binary target with per-bar counts.
flatui_simple = ["#3498db", "#e74c3c"]  # blue = Not Affected, red = Affected
ax=sns.countplot(x='target',data=df, palette=sns.color_palette(flatui_simple));
ax.set_xticklabels(labels=['Not Affected','Affected'], fontsize=14)
ax.set_xlabel('')
ax.set_ylabel("Count",fontsize=14)
# Annotate each bar with its raw count, 3 points above the bar top.
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 3), textcoords = 'offset points', fontsize = 15)
# Raw class counts; count()[0] takes the non-null count of the first column.
affected = df[(df["target"]==1)].count()[0]
not_affected = df[(df["target"]==0)].count()[0]
tot = df.count()[0]
import matplotlib.pyplot as plt
# Pie (donut) chart of the class proportions.
labels = ['Not Affected', 'Affected']
sizes = [not_affected/tot, affected/tot]
# colors: blue = Not Affected, red = Affected
flatui_simple = ["#3498db", "#e74c3c"]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, colors = flatui_simple, labels=labels, autopct='%1.1f%%', startangle=90,
        textprops={'fontsize': 14})
# draw circle: overlaying a white disc turns the pie into a donut chart
centre_circle = plt.Circle((0,0),0.70,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')
plt.tight_layout()
plt.show()
Note that the majority of samples in our dataset are about people who never had some kind of heart disease.
We have 160 samples of Not Affected vs 137 of Affected People. By the way, this is not a strong predominance.
# Sex distribution of the whole sample (encoded 0 = female, 1 = male).
sex_pal = ["#ff96ef", "#68bfff"]
ax = sns.countplot(x='sex',data=df, palette=sns.color_palette(sex_pal));
ax.set_xticklabels(labels=['Female','Male'], fontsize=14);
ax.set_xlabel("Sex",fontsize=14); ax.set_ylabel("Count",fontsize=14);
# Annotate each bar with its raw count.
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 4), textcoords = 'offset points', fontsize = 15)
Dataset samples are for the majority relative to male patients. Male are more than twice the Female. Our data distribution is biased.
# Sex distribution split by target, plus per-sex affected rates.
ax1 = sns.catplot(x='sex', col='target', kind='count', data=df, palette=sns.color_palette(sex_pal), estimator=lambda x: len(x));
ax1.set(xticks=[0, 1], xticklabels=['Female', 'Male']);
ax1.set_xticklabels(fontsize=14);
# 1 is for male
# 0 is for female
# count()[0] counts the non-null rows via the first column.
aff_male = df[(df["target"]==1) & (df["sex"]==1)].count()[0]
aff_female = df[(df["target"]==1) & (df["sex"]==0)].count()[0]
num_male = df[df["sex"]==1].count()[0]
num_female = df[df["sex"]==0].count()[0]
print("Total number of Male subjects is: %d" % num_male)
print("Total number of Female subjects is: %d" % num_female)
print("Number of Heart Diseases Affected male is: %d on %d" % (aff_male, num_male))
print("Number of Heart Diseases Affected female is: %d on %d" % (aff_female, num_female))
male_rate = 100*(aff_male/num_male) # affected rate for male
female_rate = 100*(aff_female/num_female) # affected rate for female
print("")
print("Percentage of affected Male: %.02f%% " % male_rate)
print("Percentage of affected Female: %.02f%% " % female_rate)
As we expected the most affected by heart diseases are Male subjects. Note that even if our dataset has more than twice male samples w.r.t female ones, more than the half man samples have some kind of heart disease.
# rug plot draws a small vertical tick at each observation
# Age distribution of the whole sample (histogram + KDE + rug).
sns.distplot(df['age'], kde=True, rug=True);
Quantitative data that we have used is the age, and we are comparing it to the categorical variable target.
# Age distribution per target class (box plot).
sns.boxplot(x='target',y='age',data=df, palette=sns.color_palette(flatui_simple));
Is interesting to note that the majority of heart diseases affected patient have an AVG age (~60)
Seems that Heart Diseases are more likely to affect people around sixty years old. From the boxplot is also evident the presence of outliers in Target=1 case.
The majority of Target-1 outliers have an age between 25 and 40 years old.
Thalach attribute represents the Maximum heart rate achieved during thalium stress test.
# Max heart rate achieved (thalach): overall distribution and per-target boxes.
sns.distplot(df['thalach'], kde=True, rug=False);
# 'thalach' represents Thalium Test maximum heart rate achieved
sns.boxplot(x='target',y='thalach',data=df, palette=sns.color_palette(flatui_simple));
Resting blood pressure. Measured in (mm Hg).
# Resting blood pressure (trestbps): distribution and per-target box plot.
sns.distplot(df['trestbps'], kde=True, rug=False);
sns.boxplot(x=df['target'], y=df['trestbps'], palette=sns.color_palette(flatui_simple));
Trestbps seems not to be an important feature to discriminate between H.D. affected and Not affected patients.
Serum cholesterol. Measured in (mg/dl).
# Serum cholesterol (chol): distribution and per-target box plot.
sns.distplot(df['chol'], kde=True, rug=False);
sns.boxplot(x=df['target'], y=df['chol'], palette=sns.color_palette(flatui_simple));
From UCI Dataset description, this feature can assume 4 different values listed below:
Value 1: typical angina
Value 2: atypical angina
Value 3: non-anginal pain
Value 4: asymptomatic
# Chest pain type (cp): unique values, overall counts, and per-target counts.
cp_unique = df.cp.unique()
cp_unique.sort()
# counting
print("'Chest Pain Type' feature unique values are: ", cp_unique)
df.cp.value_counts()
ax = sns.countplot(x='cp',data=df, palette=sns.light_palette("navy"));
ax.set_xticklabels(labels=['Typical A.','Atypical A.', 'Non-anginal P.', 'Asymptomatic']);
ax.set_xlabel("Chest Pain Type",fontsize=14); ax.set_ylabel("Count",fontsize=14);
k = sns.catplot(x='cp', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
k.set(xticks=[0, 1, 2, 3], xticklabels=['1-Typical a.', '2-Atypical a.', '3-Non-anginal', '4-Asymptomatic']);
TODO: Considerations on Chest Pain Type here!
Stands for Thalium stress test result (normal, fixed defect, or reversible defect)
Value 3: normal
Value 6: fixed defect
Value 7: reversible defect
# Thalium stress test result (thal): unique values and per-target counts.
thal_unique = df.thal.unique()
thal_unique.sort()
print("'Thal' feature unique values are: ", thal_unique)
print()
df.thal.value_counts()
n = sns.countplot(x='thal',data=df, palette=sns.light_palette("navy"));
n.set_xticklabels(labels=['Normal', 'Fixed Defect', 'Reversible Defect']);
#n.set_xticklabels(ax.get_xticklabels(), rotation=10, ha="right") # xtickslabels rotation
#n.set_xticklabels(ax.get_xticklabels(), fontsize=10) # xtickslabels font size
n.set_xlabel("Thal Test Result",fontsize=14); n.set_ylabel("Count",fontsize=14);
m = sns.catplot(x='thal', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
m.set(xticklabels=['normal', 'fixed\ndefect', 'Reversible\ndefect']);
m.set_xticklabels(fontsize=12);
It's evident from the histograms above that the majority of Heart Disease affected samples are characterized by a Thalium Stress Test Result of 'type 7' - Reversible Defect.
Fasting Blood Sugar (0 if < 120 mg/dl, 1 if > 120 mg/dl)
# Fasting blood sugar (fbs) per target class.
x = sns.catplot(x='fbs', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
x.set(xticks=[0, 1], xticklabels=['<120 mg/dl', '>=120 mg/dl']);
x.set_xticklabels(fontsize=12);
Resting electrocardiographic results. Values:
Value 0: normal
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
# Resting ECG (restecg): unique values, per-target counts, and the size of
# the rare ST-T wave abnormality category (restecg == 1).
restecg_unique = df.restecg.unique()
restecg_unique.sort()
print("'restecg' feature unique values are: ", restecg_unique)
print()
df.restecg.value_counts()
x = sns.catplot(x='restecg', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
x.set(xticks=[0, 1, 2], xticklabels=['normal', 'ST-t Wave\nabnormality', 'likely left v.\nhypertrophy']);
x.set_xticklabels(fontsize=12);
tot_abn = (df['restecg'] == 1).sum()
tot_abn_affected = ((df['target'] == 1) & (df['restecg'] == 1)).sum()
print("Number of Patient having ST-T wave abnormality: ", tot_abn)
print("Number of Patient having ST-T wave abnormality that are Affected: ", tot_abn_affected)
Note that our dataset contains only 4 samples in which Resting Electrocardiographic result (restecg) is of type 1, so having ST-T wave abnormality.
Of these 4 samples, 3 are about heart disease affected patients (75%).
Exercise induced angina (1=yes or 0=no)
# Exercise-induced angina (exang) per target class.
x = sns.catplot(x='exang', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
x.set(xticks=[0, 1], xticklabels=['Exang=0', 'Exang=1']);
x.set_xticklabels(fontsize=12);
ST depression induced by exercise relative to rest
# ST depression induced by exercise (oldpeak) distribution.
sns.distplot(df['oldpeak'], kde=True, rug=False);
The slope of the peak exercise ST segment. Heart Rate slope during peak exercise? Values:
Value 1: upsloping
Value 2: flat
Value 3: downsloping
# Slope of the peak-exercise ST segment per target class.
l=sns.catplot(x='slope', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
l.set(xticklabels=['upsloping', 'flat', 'downsloping']);
l.set_xticklabels(fontsize=12);
When fluoroscopy is used during a cardiac catheterization, the physician can see how blood is moving through the blood vessels and where there are blockages.
Number of major vessels (0-3) colored by flourosopy
# Number of major vessels colored by fluoroscopy (ca) per target class.
m=sns.catplot(x='ca', col='target', kind='count', data=df, palette=sns.light_palette("navy"));
m.set_xticklabels(fontsize=12);
# Pairwise Pearson correlation heatmap of all columns of df.
plt.figure(figsize=(20,20))
g=sns.heatmap(df.corr(), annot = True, cmap='Blues',linewidths=.1)
plt.show()
Violin plot is a variation of box plot. It shows the distribution of quantitative data across several levels of categorical variables. The violin plot also features a kde of the underlying distribution.
Violin plot is more informative than a plain box plot. While a box plot only shows summary statistics such as mean/median and interquartile ranges, the violin plot shows the full distribution of the data.
Thalac is compared against target.
..
see: https://datavizcatalogue.com/methods/violin_plot.html
# Violin plots: per-class distribution of each quantitative feature.
# Note: all three calls draw on the same (current) axes in one cell.
sns.violinplot(x='target', y='thalach', data=df, palette=sns.color_palette(flatui_simple));
sns.violinplot(x='target', y='trestbps', data=df, palette=sns.color_palette(flatui_simple));
sns.violinplot(x='target', y='age', data=df, palette=sns.color_palette(flatui_simple));
# Show the feature-description table; widen columns for long descriptions.
pd.options.display.max_colwidth = 150
features_table = pd.read_csv("features_heart.csv")
display(features_table)
As we have seen our dataset contains some categorical data. In our specific case each cat. data value represents a different category, with no ordinal relationship. Some algorithms can work directly with categorical data others don't.
e.g Decision trees can be learned directly from categorical data with no data transformation required (note that this depends on the specific implementation).
At the same time many machine learning algorithms do not handle categorical features at all. They require all input variables and output variables to be numeric. In this case we have to preprocess manually the categorical features, in order to have them in an appropriate format for the machine learning model (usually: numeric features).
Because of the lack of ordinal relationship between categories, I don't want the model to assume a natural order between categories. For this reason I've chosen to use OneHot Encoding Method.
One hot encoding is a representation of categorical variables as binary vectors.
OneHot Encoding Example:

Our dataset categorical features are:
# One-hot encode the nominal categorical features: pandas.get_dummies()
# performs the one-hot expansion, producing 'en_<feature>_<value>' columns.
categorical_features = ['cp', 'restecg', 'slope', 'thal']
for feature in categorical_features:
    # FIX: removed the unused 'num_classes' local computed in the original.
    curr_onehot = pd.get_dummies(df[feature], prefix='en_' + feature)
    # Replace the raw categorical column with its one-hot columns.
    df = df.drop(feature, axis=1)
    df = df.join(curr_onehot)
from sklearn.preprocessing import StandardScaler
# FIX: after the one-hot step the dummy columns were joined AFTER 'target',
# so 'target' is no longer the last column; the original df.columns[:-1]
# selection therefore leaked the label into the feature matrix. Build X by
# explicitly dropping the target column instead.
X = df.drop('target', axis=1).values # features from the encoded dataframe
y = df["target"].values
# NOTE (translated): show how, without normalization, there is an outlier at (300, -10)
X_std = StandardScaler(with_mean=True, with_std=True).fit_transform(X) # Normalization: Mean=0; Variance=1
from sklearn.decomposition import PCA
import matplotlib.patches as mpatches
# Two independent 2-component PCA fits: one on standardized data, one on raw
# data, to visualize the effect of normalization on the projection.
pca, pca1 = PCA(n_components=2), PCA(n_components=2)
X_t = pca.fit_transform(X_std) # with normalization
X_t1 = pca1.fit_transform(X) # without normalization, presence of Outliers, this impact on PC1 p.v.e
# Label to color dict (manual)
label_color_dict = {0: 'blue', 1: 'red'}
# Color vector creation
cvec = [label_color_dict[label] for label in y]
# Legend
negative_patch = mpatches.Patch(color='blue', label='Not affected')
positive_patch = mpatches.Patch(color='red', label='Affected')
# 1st and 2nd Components Scatter, normalized vs raw side by side
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(121)
ax.set_xlabel('1st PC (%.2f%%)' % (pca.explained_variance_ratio_[0] * 100))
ax.set_ylabel('2nd PC (%.2f%%)' % (pca.explained_variance_ratio_[1] * 100))
ax.set_title('PCA, on Normalized Dataset', fontsize=16)
ax.scatter(X_t[:, 0], X_t[:, 1], c=cvec, edgecolor='b', alpha=0.85, s=100)
ax.legend(handles=[negative_patch, positive_patch])
ax1 = fig.add_subplot(122)
ax1.set_xlabel('1st PC (%.2f%%)' % (pca1.explained_variance_ratio_[0] * 100))
ax1.set_ylabel('2nd PC (%.2f%%)' % (pca1.explained_variance_ratio_[1] * 100))
ax1.set_title('PCA, not Normalized Dataset', fontsize=16)
ax1.scatter(X_t1[:, 0], X_t1[:, 1], c=cvec, edgecolor='b', alpha=0.85, s=100)
ax1.legend(handles=[negative_patch, positive_patch])
plt.show()
Introduction to cross validation!
Utility Functions for decision regions plotting:
def make_meshgrid(x, y, h=.02):
    """Build a 2-D mesh covering the data range, padded by 1 on each side.

    Parameters
    ----------
    x : array, data to base the x-axis meshgrid on
    y : array, data to base the y-axis meshgrid on
    h : float, optional step size of the mesh

    Returns
    -------
    xx, yy : ndarray coordinate matrices
    """
    pad = 1
    x_axis = np.arange(x.min() - pad, x.max() + pad, h)
    y_axis = np.arange(y.min() - pad, y.max() + pad, h)
    return np.meshgrid(x_axis, y_axis)
def plot_contours(ax, clf, xx, yy, **params):
    """Draw a classifier's decision regions on the given axes.

    Parameters
    ----------
    ax : matplotlib axes object
    clf : a fitted classifier exposing predict()
    xx : meshgrid ndarray
    yy : meshgrid ndarray
    params : extra keyword arguments forwarded to contourf
    """
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    predictions = clf.predict(grid_points).reshape(xx.shape)
    return ax.contourf(xx, yy, predictions, **params)
# Hold-out split: 70% train / 30% test on the standardized feature matrix.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.30, random_state=0)
print("X_train shape: ", X_train.shape);
print("X_test shape: ", X_test.shape);
# KNN hyper-parameter search: 12-fold CV over n_neighbors on the training set,
# then evaluation of the refitted best estimator on the held-out test set.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
k_values = [1, 2, 3, 10, 20, 30, 40, 50, 70, 100, 189]
#weights_values = ["uniform", "distance"]
#tuned_parameters = {'n_neighbors': k_values, 'weights': weights_values}
tuned_parameters = {'n_neighbors': k_values}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, tuned_parameters , cv=12, refit=True, return_train_score=True)
knn_cv.fit(X_train, y_train)
print('GridSearchCV 12-Fold on Training Set')
print('CV- Best Parameters are: ', knn_cv.best_params_)
# Mean cross-validated score of the best_estimator
accuracy = 100 * float(knn_cv.best_score_)
print('CV - Best Accuracy Score (AVG on all folds scores) is: %.03f%% ' % accuracy)
# best estimator is accessible by "clf.best_estimator_" if refit=TRUE
print()
print("Using best parameters retrieved by 12-fold CV on Test Set:")
y_pred = knn_cv.best_estimator_.predict(X_test)
test_score = 100 * metrics.accuracy_score(y_test, y_pred)
print("knn_cv - With best params from GridSearchCV [n_neighbors=%d]\nAccuracy Score on Test Set is %.2f%%" %
      (knn_cv.best_params_['n_neighbors'], test_score))
GridSearch_table_plot(knn_cv, "n_neighbors", negative=False);
Consideration on above plot:
By increasing the number of n_neighbors considered classification perfomance of our model decreases.
At some point, depending on different class samples cardinality in our dataset, our model will always choose for the majority class, which is 'Not Affected' (160 not affected vs 137 affected).
Since Not Affected and Affected classes represent respectively ~54% and ~46% of samples, at that point ( n_neighbors >= 189 if we use 30% test split size ) our KNN model accuracy will be ~54%, predicting always not affected.
Number of folds for CrossValidation: 12
mean_test_score is the average of Test Scores on all k folds.
mean_train_score is the average of Train Scores on all k folds.
#['params', 'rank_test_score', 'mean_train_score', 'mean_test_score'])
# Extract mean train/test scores from the CV results for the plotly curves,
# then display the ranked results table.
df_cv = pd.DataFrame(knn_cv.cv_results_)
cv_mean_train_scores = 100*(df_cv['mean_train_score'].values) # df is actually sorted by "param_n_neighbors"
cv_mean_test_scores = 100*(df_cv['mean_test_score'].values)
df_cv = df_cv[['param_n_neighbors', 'rank_test_score','mean_train_score','mean_test_score']].sort_values(by=['rank_test_score'])
display(df_cv)
When doing k-fold cross-validation, we train k models, each one leaving 1/𝑘 portion of the data out. For each of the models, its train error and validation error is computed. The train error will be the error on the data selected to train the model, and the validation error will be the data left out of the training.
For this reason, for each parameters set, we'll have k training errors and k validation/test errors, and computing their averages: mean_train_score and mean_test_score.
**Cannot plot decision regions if we are in a space with higher dimensionality than 2d!**
import numpy as np
# Plotting decision regions
from matplotlib.colors import ListedColormap
h = .02 # step size in the mesh
# two class classification target[0, 1]
cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])  # background regions
cmap_bold = ListedColormap(['#FF0000', '#0000FF'])   # sample points
# Mesh covering the 2-D PCA projection, padded by 1 on each side.
x_min, x_max = X_t[:, 0].min() - 1, X_t[:, 0].max() + 1
y_min, y_max = X_t[:, 1].min() - 1, X_t[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
# NOTE(review): best_estimator_ was fitted on the full standardized feature
# matrix, while the mesh has only the 2 PCA columns -- confirm the feature
# dimensions agree, otherwise this predict call fails.
Z = knn_cv.best_estimator_.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure(figsize=(9, 6))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot the training points
plt.scatter(X_t[:, 0], X_t[:, 1], c=y, cmap=cmap_bold,
            edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("knn classification (k = %d)" % knn_cv.best_params_['n_neighbors'])
plt.show()
# Fit one KNN per k value (uniform weights) and record its test-set accuracy.
k_test_scores = []  # test-set accuracy for each k, 'uniform' weights
clf_vect = []       # the fitted classifier for each k value
for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    model.fit(X_train, y_train)
    clf_vect.append(model)
    predictions = model.predict(X_test)
    k_test_scores.append(100 * metrics.accuracy_score(y_test, predictions))
# Plotly graph: three score curves as a function of n_neighbors.
configure_plotly_browser_state()
init_notebook_mode(connected=False)
trace0 = Scatter(
    x = k_values,
    y = k_test_scores,
    mode = 'lines+markers',
    name = '<b>Score on Test Set</b>',
    line = dict(
        color = ('rgb(255,0,0)'), # red
        width = 4)
)
trace1 = Scatter(
    x = k_values,
    y = cv_mean_train_scores,
    #mode = 'lines+markers',
    mode = 'lines',
    name = 'Mean Train Score',
    line = dict(
        color = ('rgb(49,130,189)'), # light blue
        width = 4)
)
trace2 = Scatter(
    x = k_values,
    y = cv_mean_test_scores,
    mode = 'lines+markers',
    name = 'Mean Test Score',
    line = dict(
        color = ('rgb(47,79,79)'), # grey
        width = 3)
)
# subplot style: three stacked panels sharing the x axis.
# NOTE(review): plotly.tools.make_subplots is deprecated in newer plotly
# (moved to plotly.subplots) -- fine with the old pinned version used here.
fig = tools.make_subplots(rows=3, cols=1,subplot_titles=
    ('<b>Cross-Validation Train Scores</b>',
     '<b>Cross-Validation Test Scores</b>',
     '<b>Test Set Scores</b>'), shared_xaxes=True)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 2, 1)
fig.append_trace(trace0, 3, 1)
fig['layout'].update(title='KNN Scores', height=900) # in there height=600, width=600,
fig['layout']['xaxis'].update(title='n_neighbors') #shared axis X
iplot(fig, filename='stacked-knn-scores')
# Alternative (translated from "Oppure"): the same three traces in one plot
data = [trace0, trace1, trace2]
layout = dict(title = 'KNN Scores',
              xaxis = dict(title = 'K value'),
              yaxis = dict(title = 'Score'),
              )
fig = dict(data=data, layout=layout)
iplot(fig, filename='k_score_plot')
**Cannot plot decision regions if we are in a space with higher dimensionality than 2d!**
import numpy as np
# Plotting decision regions
from matplotlib.colors import ListedColormap
h = .02 # step size in the mesh
# Create color maps
# cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
# cmap_bold = ListedColormap(['#FF0000', '#0000FF', '#00FF00'])
# two class classification target[0, 1]
cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#0000FF'])
# One decision-boundary figure per fitted KNN classifier.
# NOTE(review): the classifiers in clf_vect were fitted on X_train (from the
# standardized matrix), while the mesh and scatter use the 2-D PCA projection
# X_t -- confirm the feature dimensions agree before trusting these plots.
for (k, clf) in zip(k_values, clf_vect):
    # we create an instance of Neighbours Classifier and fit the data.
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X_t[:, 0].min() - 1, X_t[:, 0].max() + 1
    y_min, y_max = X_t[:, 1].min() - 1, X_t[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot also the training points
    plt.scatter(X_t[:, 0], X_t[:, 1], c=y, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("%d-NN Decision Boundary" % k)
K-nearest neighbors works assigning a label to a sample depending on the k nearest neighbors data points labels.
Once found the group of k nearest training samples to our target data point (can be a test sample/point) the label given to the target will be the most spread label among all the k nearest. Exactly like a majority voting!
With K increasing to infinity the margin will be all blue or all red depending on the total majority.
source: https://www.analyticsvidhya.com/blog/2018/03/introduction-k-neighbours-algorithm-clustering/
2.1. Feature Extraction
As the dimension (number of features) increase, it will be harder to do the data mining task such as
classification. To solve that problem, Feature Extraction namely PCA which extract original features
into a new features using mapping function is used. PCA processes include: data centering, calculate
covarian matrix, calculate Eigenvector and Eigenvalue, select top Eigenvector, and transformation
data.
2.2. Classification
Support Vector Machine (SVM) is well known as a classifier which can model complex data, has
good accuracy and less prone to overfitting. SVM works by searching the linear optimal separating
hyperplane (decision boundary). The rationale is that decision boundary with large margin is better
when handling unseen data compared to decision boundary with small margin. When the data are not
linearly separable, SVM transform original data into a higher dimension using a nonlinear mapping to
obtain the separating hyperplane.
see ref.: https://iopscience.iop.org/article/10.1088/1742-6596/971/1/012003/pdf
# Considering only first two principal components
# NOTE: this overwrites the earlier split on X_std -- from here on the
# train/test data live in the 2-D PCA space X_t.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_t, y, test_size=0.30, random_state=0)
print(X_train.shape); print(y_train.shape)
from sklearn import svm
C = 1.0 # SVM regularization hyper-parameter
# training different SVM models on the 2-D PCA projection
models = (svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C),
          svm.SVC(kernel='rbf', gamma=0.7, C=C),
          svm.SVC(kernel='poly', degree=3, C=C))
models = (clf.fit(X_t, y) for clf in models)
# title for the plots
titles = ('SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC with RBF kernel',
          'SVC with polynomial (degree 3)')
fig, sub = plt.subplots(2, 2, figsize=(14,9))
fig.suptitle('SVM Models Comparison', size=16) # or plt.suptitle('Main title')
plt.subplots_adjust(wspace=0.2, hspace=0.2)
X0, X1 = X_t[:, 0], X_t[:, 1]
xx, yy = make_meshgrid(X0, X1)
for clf, title, ax in zip(models, titles, sub.flatten()):
    plot_contours(ax, clf, xx, yy,
                  cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    # FIX: the axes are principal components, not iris sepal measurements --
    # the original labels were left over from the sklearn iris SVM example.
    ax.set_xlabel('1st Principal Component')
    ax.set_ylabel('2nd Principal Component')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)
plt.show()
# Tuning Parameters
# FIX: the original C list contained both 0.10 and 0.1 -- the same value
# twice -- so GridSearchCV fitted every duplicated candidate needlessly.
C_range = [0.001, 0.1, 10, 25, 50, 100, 1000]
gamma_range = [10, 1, 1e-2, 1e-3, 1e-4, 1e-5]
tuned_parameters = [{'kernel': ['rbf'], 'gamma': gamma_range, 'C': C_range},
                    {'kernel': ['sigmoid'], 'gamma': gamma_range, 'C': C_range},
                    {'kernel': ['linear'], 'C': C_range}
                    ]
# GridSearchCV
# performing 12-fold validation on the training set (refit=True keeps the
# best estimator retrained on the whole training set)
svm_cv = GridSearchCV(svm.SVC(), tuned_parameters, cv=12, refit=True, return_train_score=True)
svm_cv.fit(X_train, y_train)
print('GridSearchCV 12-Fold on Training Set.\nBest_params= %s with best_score_= %.03f'%
      (svm_cv.best_params_, 100*float(svm_cv.best_score_)))
#['params', 'rank_test_score', 'mean_train_score', 'mean_test_score'])
# Tabulate the SVM CV results, best-ranked combinations first.
df_svm_cv = pd.DataFrame(svm_cv.cv_results_)
# selecting only the interesting columns
df_svm_cv = df_svm_cv[['params', 'rank_test_score','mean_train_score','mean_test_score']].sort_values(by=['rank_test_score'])
# cv_svm_train_scores = 100*(df_svm_cv['mean_train_score'].values)
# cv_sv_test_scores = 100*(df_svm_cv['mean_test_score'].values)
df_svm_cv.head()
# best estimator is accessible by "svm_cv.best_estimator_" because refit=True
y_pred = svm_cv.best_estimator_.predict(X_test)
score = 100 * metrics.accuracy_score(y_test, y_pred)
# FIX: best_params_ has no 'gamma' key when the winning kernel is 'linear';
# .get avoids the KeyError the original direct lookup would raise.
best_gamma = svm_cv.best_params_.get('gamma', float('nan'))
print("SVM CV - best params from GridSearchCV are: [Kernel: %s, C: %.3f , gamma: %.3f]\nAccuracy Score on Test Set is: %.2f%%" %
      (svm_cv.best_params_['kernel'], svm_cv.best_params_['C'], best_gamma, score))
# Plotting decision boundaries of the GridSearchCV best estimator:
X0, X1 = X_train[:, 0], X_train[:, 1]
xx, yy = make_meshgrid(X0, X1)
fig2 = plt.figure(2, figsize=(10, 7))
ax3 = fig2.add_subplot(111)
plot_contours(ax3, svm_cv.best_estimator_, xx, yy, # using Best estimator
              cmap=plt.cm.coolwarm, alpha=0.8)
ax3.scatter(X0, X1, c=y_train, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax3.set_xlim(xx.min(), xx.max())
ax3.set_ylim(yy.min(), yy.max())
# FIX: axes are PCA components, not iris sepal measurements (labels were
# copied from the sklearn iris example).
ax3.set_xlabel('1st Principal Component\nMean Test Score (12-fold): %.03f ; Score on Test Set: %.03f' % (svm_cv.best_score_, score), fontsize=13)
ax3.set_ylabel('2nd Principal Component', fontsize=13)
ax3.set_xticks(())
ax3.set_yticks(())
# FIX: guard against the missing 'gamma' key when the best kernel is linear.
best_gamma = svm_cv.best_params_.get('gamma', float('nan'))
ax3.set_title(
    'GridSearchCV %d-Fold Validation\n Kernel: %s, C = %.02f, gamma = %.02f' % (12, svm_cv.best_params_['kernel'], svm_cv.best_params_['C'], best_gamma), fontsize=15);
Considerations
df_svm_cv # display the full ranked table of CV parameter combinations
Estimator ranked one on Mean Test Score:
63 {'C': 1000, 'gamma': 1e-05, 'kernel': 'sigmoid'} 1 0.884061 0.898551
12 {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'} 1 0.888898 0.898551
22 {'C': 50, 'gamma': 0.0001, 'kernel': 'rbf'} 1 0.884061 0.898551
58 {'C': 100, 'gamma': 0.0001, 'kernel': 'sigmoid'} 1 0.884061 0.898551
45 {'C': 10, 'gamma': 0.001, 'kernel': 'sigmoid'} 1 0.884061 0.898551
66 {'C': 0.1, 'kernel': 'linear'} 6 0.886700 0.893720
# Hand-picked parameter combinations (taken from the CV ranking table above),
# each paired with a descriptive plot title.
m = []
t = []
m.append(svm.SVC(kernel='rbf', C=10, gamma = 10))
t.append("C: 10, gamma: 10, kernel: rbf")
# FIX: gamma was 0.1 here while the title (and the CV ranking table above)
# said 0.0001 -- the code now matches the intended top-ranked combination.
m.append(svm.SVC(kernel='rbf', C=50, gamma = 0.0001))
t.append("C: 50, gamma: 0.0001, kernel: rbf")
m.append(svm.SVC(kernel='sigmoid', C=1000, gamma = 1e-05))
t.append("C: 1000, gamma: 1e-05, kernel: sigmoid")
m.append(svm.SVC(kernel='sigmoid', C=100, gamma = 0.0001))
t.append("C: 100, gamma: 0.0001, kernel: sigmoid")
m.append(svm.SVC(kernel='sigmoid', C=10, gamma = 0.1))
t.append("C: 10, gamma: 0.1, kernel: sigmoid")
m.append(svm.SVC(kernel='sigmoid', C=10, gamma = 1))
# FIX: title said gamma 0.1 but the model uses gamma=1
t.append("C: 10, gamma: 1, kernel: sigmoid")
m.append(svm.SVC(kernel='linear', C=0.1))
t.append("C: 0.1, kernel: linear")
m.append(svm.SVC(kernel='linear', C=1))
t.append("C: 1, kernel: linear")
m.append(svm.SVC(kernel='linear', C=10))
t.append("C: 10, kernel: linear")
# Fit each candidate, score it on the test set, and plot its decision regions.
for m_value, t_value in zip(m, t):
    curr_clf = m_value
    curr_clf.fit(X_train, y_train)
    curr_pred = curr_clf.predict(X_test)
    score = 100 * metrics.accuracy_score(y_test, curr_pred)
    X0, X1 = X_train[:, 0], X_train[:, 1]
    xx, yy = make_meshgrid(X0, X1)
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111)
    plot_contours(ax, curr_clf, xx, yy,
                  cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=y_train, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    # FIX: axes are PCA components, not iris sepal measurements
    ax.set_xlabel('1st Principal Component\nScore on Test Set: %.03f' % (score), fontsize=13)
    ax.set_ylabel('2nd Principal Component', fontsize=13)
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(t_value, fontsize=16)
    plt.show()
    # FIX: removed the stray "i += 1" that ended this loop -- 'i' was never
    # initialized before this cell and raised a NameError on first iteration.
# Exploration of the RBF kernel over a grid of (C, gamma) values.
# FIX: the original looped over names C / gamma / kernel that were never
# defined as sequences (C was the scalar 1.0 from an earlier cell, gamma and
# kernel were undefined), so the cell crashed. Explicit value lists are
# defined here, and the kernel loop is dropped because the kernel was
# hard-coded to 'rbf' inside the loop body anyway.
i = 1
C_values = [0.1, 1, 10, 100]
gamma_values = [0.01, 0.1, 1, 10]
for C_value in C_values:
    for gamma_value in gamma_values:
        curr_clf = svm.SVC(kernel='rbf', gamma=gamma_value, C=C_value)
        curr_clf.fit(X_train, y_train)
        curr_pred = curr_clf.predict(X_test)
        score = 100 * metrics.accuracy_score(y_test, curr_pred)
        X0, X1 = X_train[:, 0], X_train[:, 1]
        xx, yy = make_meshgrid(X0, X1)
        fig = plt.figure(figsize=(10, 7))
        ax = fig.add_subplot(111)
        plot_contours(ax, curr_clf, xx, yy,
                      cmap=plt.cm.coolwarm, alpha=0.8)
        ax.scatter(X0, X1, c=y_train, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        # FIX: axes are PCA components, not iris sepal measurements
        ax.set_xlabel('1st Principal Component\nScore on Test Set: %.03f' % (score), fontsize=13)
        ax.set_ylabel('2nd Principal Component', fontsize=13)
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title('Plot %d: SVM - RBF with C=%.02f, gamma=%.03f' % (i, C_value, gamma_value), fontsize=16)
        plt.show()
        i += 1
Considerations on decision boundaries! TO DO!